{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "181b5b6d",
   "metadata": {},
   "source": [
    "# XML parser\n",
    "This output parser allows users to obtain results from LLM in the popular XML format. \n",
    "\n",
    "Keep in mind that large language models are leaky abstractions! You'll have to use an LLM with sufficient capacity to generate well-formed XML. \n",
    "\n",
    "In the following example we use Claude model (https://docs.anthropic.com/claude/docs) which works really well with XML tags."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "3b10fc55",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.output_parsers import XMLOutputParser\n",
    "from langchain.prompts import PromptTemplate\n",
    "from langchain_community.chat_models import ChatAnthropic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "909161d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = ChatAnthropic(model=\"claude-2\", max_tokens_to_sample=512, temperature=0.1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "da312f86-0d2a-4aef-a09d-1e72bd0ea9b1",
   "metadata": {},
   "source": [
    "Let's start with the simple request to the model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b03785af-69fc-40a1-a1be-c04ed6fade70",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " Here is the shortened filmography for Tom Hanks, enclosed in XML tags:\n",
      "\n",
      "<movie>Splash</movie>\n",
      "<movie>Big</movie>\n",
      "<movie>A League of Their Own</movie>\n",
      "<movie>Sleepless in Seattle</movie>\n",
      "<movie>Forrest Gump</movie>\n",
      "<movie>Toy Story</movie>\n",
      "<movie>Apollo 13</movie>\n",
      "<movie>Saving Private Ryan</movie>\n",
      "<movie>Cast Away</movie>\n",
      "<movie>The Da Vinci Code</movie>\n",
      "<movie>Captain Phillips</movie>\n"
     ]
    }
   ],
   "source": [
    "actor_query = \"Generate the shortened filmography for Tom Hanks.\"\n",
    "output = model.invoke(\n",
    "    f\"\"\"{actor_query}\n",
    "Please enclose the movies in <movie></movie> tags\"\"\"\n",
    ")\n",
    "print(output.content)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4db65781-3d54-4ba6-ae26-5b4ead47a4c8",
   "metadata": {},
   "source": [
    "Now we will use the XMLOutputParser in order to get the structured output."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "87ba8d11",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'filmography': [{'movie': [{'title': 'Big'}, {'year': '1988'}]}, {'movie': [{'title': 'Forrest Gump'}, {'year': '1994'}]}, {'movie': [{'title': 'Toy Story'}, {'year': '1995'}]}, {'movie': [{'title': 'Saving Private Ryan'}, {'year': '1998'}]}, {'movie': [{'title': 'Cast Away'}, {'year': '2000'}]}]}\n"
     ]
    }
   ],
   "source": [
    "parser = XMLOutputParser()\n",
    "\n",
    "prompt = PromptTemplate(\n",
    "    template=\"\"\"{query}\\n{format_instructions}\"\"\",\n",
    "    input_variables=[\"query\"],\n",
    "    partial_variables={\"format_instructions\": parser.get_format_instructions()},\n",
    ")\n",
    "\n",
    "chain = prompt | model | parser\n",
    "\n",
    "output = chain.invoke({\"query\": actor_query})\n",
    "print(output)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "327f5479-77e0-4549-8393-2cd7a286d491",
   "metadata": {},
   "source": [
    "Finally, let's add some tags to tailor the output to our needs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b722a235",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'movies': [{'actor': [{'name': 'Tom Hanks'}, {'film': [{'name': 'Forrest Gump'}, {'genre': 'Drama'}]}, {'film': [{'name': 'Cast Away'}, {'genre': 'Adventure'}]}, {'film': [{'name': 'Saving Private Ryan'}, {'genre': 'War'}]}]}]}\n"
     ]
    }
   ],
   "source": [
    "parser = XMLOutputParser(tags=[\"movies\", \"actor\", \"film\", \"name\", \"genre\"])\n",
    "prompt = PromptTemplate(\n",
    "    template=\"\"\"{query}\\n{format_instructions}\"\"\",\n",
    "    input_variables=[\"query\"],\n",
    "    partial_variables={\"format_instructions\": parser.get_format_instructions()},\n",
    ")\n",
    "\n",
    "\n",
    "chain = prompt | model | parser\n",
    "\n",
    "output = chain.invoke({\"query\": actor_query})\n",
    "\n",
    "print(output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "808a5df5-b11e-42a0-bd7a-6b95ca0c3eba",
   "metadata": {},
   "outputs": [
    {
     "ename": "ParseError",
     "evalue": "syntax error: line 1, column 1 (<string>)",
     "output_type": "error",
     "traceback": [
      "Traceback \u001b[0;36m(most recent call last)\u001b[0m:\n",
      "\u001b[0m  File \u001b[1;32m~/.pyenv/versions/3.10.1/envs/langchain/lib/python3.10/site-packages/IPython/core/interactiveshell.py:3508\u001b[0m in \u001b[1;35mrun_code\u001b[0m\n    exec(code_obj, self.user_global_ns, self.user_ns)\u001b[0m\n",
      "\u001b[0m  Cell \u001b[1;32mIn[7], line 1\u001b[0m\n    for s in chain.stream({\"query\": actor_query}):\u001b[0m\n",
      "\u001b[0m  File \u001b[1;32m~/workplace/langchain/libs/core/langchain_core/runnables/base.py:1984\u001b[0m in \u001b[1;35mstream\u001b[0m\n    yield from self.transform(iter([input]), config, **kwargs)\u001b[0m\n",
      "\u001b[0m  File \u001b[1;32m~/workplace/langchain/libs/core/langchain_core/runnables/base.py:1974\u001b[0m in \u001b[1;35mtransform\u001b[0m\n    yield from self._transform_stream_with_config(\u001b[0m\n",
      "\u001b[0m  File \u001b[1;32m~/workplace/langchain/libs/core/langchain_core/runnables/base.py:1141\u001b[0m in \u001b[1;35m_transform_stream_with_config\u001b[0m\n    for chunk in iterator:\u001b[0m\n",
      "\u001b[0m  File \u001b[1;32m~/workplace/langchain/libs/core/langchain_core/runnables/base.py:1938\u001b[0m in \u001b[1;35m_transform\u001b[0m\n    for output in final_pipeline:\u001b[0m\n",
      "\u001b[0m  File \u001b[1;32m~/workplace/langchain/libs/core/langchain_core/output_parsers/transform.py:50\u001b[0m in \u001b[1;35mtransform\u001b[0m\n    yield from self._transform_stream_with_config(\u001b[0m\n",
      "\u001b[0m  File \u001b[1;32m~/workplace/langchain/libs/core/langchain_core/runnables/base.py:1141\u001b[0m in \u001b[1;35m_transform_stream_with_config\u001b[0m\n    for chunk in iterator:\u001b[0m\n",
      "\u001b[0m  File \u001b[1;32m~/workplace/langchain/libs/core/langchain_core/output_parsers/xml.py:71\u001b[0m in \u001b[1;35m_transform\u001b[0m\n    for event, elem in parser.read_events():\u001b[0m\n",
      "\u001b[0m  File \u001b[1;32m~/.pyenv/versions/3.10.1/lib/python3.10/xml/etree/ElementTree.py:1329\u001b[0m in \u001b[1;35mread_events\u001b[0m\n    raise event\u001b[0m\n",
      "\u001b[0;36m  File \u001b[0;32m~/.pyenv/versions/3.10.1/lib/python3.10/xml/etree/ElementTree.py:1301\u001b[0;36m in \u001b[0;35mfeed\u001b[0;36m\n\u001b[0;31m    self._parser.feed(data)\u001b[0;36m\n",
      "\u001b[0;36m  File \u001b[0;32m<string>\u001b[0;36m\u001b[0m\n\u001b[0;31mParseError\u001b[0m\u001b[0;31m:\u001b[0m syntax error: line 1, column 1\n"
     ]
    }
   ],
   "source": [
    "for s in chain.stream({\"query\": actor_query}):\n",
    "    print(s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "efc073c6",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
